By observing loyalty and credit card data, we can observe that the top three most visited places during the 2 weeks data are:
Katerina’s Cafe Employee usually go to Katerina’s Cafe on
Hippokampos Employee usually go to Hippokampos on
Guy’s Gyros Employee usually go to Guy’s Gyros Cafe on
##Anomalies
As we can see in the figure below, the number of daily frequency from credit card data and loyalty data are different. Based on credit card data the Katerina’s Cafe is most visited in 2014-06-01 with total of … transactions. However, if we observed loyalty data Katerina’s Cafe is most visited in … with total of … transactions. This problem also happened to other shops or location that we are observing.
There are total of … un-matched records. These un-matched records might lead to some new clues.
To correct this anomaly, we can add vehicle data to our analysis, we might be able to determine some of the transactions based on the employee location.
cc_freq <- group_by(cc_data, location) %>% summarise(freq = n())%>% arrange(desc(freq))
cc_freq
## # A tibble: 34 x 2
## location freq
## <chr> <int>
## 1 Katerina's Cafe 212
## 2 Hippokampos 171
## 3 Guy's Gyros 158
## 4 Brew've Been Served 156
## 5 Hallowed Grounds 92
## 6 Ouzeri Elian 87
## 7 Abila Zacharo 72
## 8 Gelatogalore 64
## 9 Kalami Kafenion 64
## 10 Bean There Done That 47
## # ... with 24 more rows
loyalty_freq <- group_by(loyalty_data, location) %>% summarise(freq = n())%>% arrange(desc(freq))
loyalty_freq
## # A tibble: 33 x 2
## location freq
## <chr> <int>
## 1 Katerina's Cafe 195
## 2 Hippokampos 155
## 3 Guy's Gyros 146
## 4 Brew've Been Served 140
## 5 Ouzeri Elian 84
## 6 Hallowed Grounds 80
## 7 Abila Zacharo 71
## 8 Kalami Kafenion 66
## 9 Gelatogalore 60
## 10 Bean There Done That 45
## # ... with 23 more rows
cc_freq <- group_by(cc_data, location, day, hours) %>% summarise(freq = n())%>% arrange(desc(freq))
cc_freq
## # A tibble: 377 x 4
## # Groups: location, day [162]
## location day hours freq
## <chr> <chr> <int> <int>
## 1 Brew've Been Served Tuesday 7 22
## 2 Brew've Been Served Wednesday 7 20
## 3 Hippokampos Thursday 13 20
## 4 Brew've Been Served Monday 7 18
## 5 Brew've Been Served Thursday 7 18
## 6 Katerina's Cafe Tuesday 20 18
## 7 Katerina's Cafe Saturday 19 17
## 8 Brew've Been Served Friday 8 16
## 9 Hallowed Grounds Wednesday 7 16
## 10 Brew've Been Served Friday 7 15
## # ... with 367 more rows
loyalty_freq <- group_by(loyalty_data, location) %>% summarise(freq = n())%>% arrange(desc(freq))
loyalty_freq
## # A tibble: 33 x 2
## location freq
## <chr> <int>
## 1 Katerina's Cafe 195
## 2 Hippokampos 155
## 3 Guy's Gyros 146
## 4 Brew've Been Served 140
## 5 Ouzeri Elian 84
## 6 Hallowed Grounds 80
## 7 Abila Zacharo 71
## 8 Kalami Kafenion 66
## 9 Gelatogalore 60
## 10 Bean There Done That 45
## # ... with 23 more rows
cc_freq_daily <- group_by(cc_data, timestamp_date, location) %>% summarise(freq = n())%>% arrange(desc(freq))
top3_place = c("Katerina's Cafe", "Hippokampos","Guy's Gyros")
cc_freq_daily_selected <- cc_freq_daily %>% filter(location %in% top3_place)
top3_vis_cc <- ggplot(cc_freq_daily_selected, aes(fill=as.character(timestamp_date), y=freq, x=location)) +
geom_bar(position="stack", stat="identity") +
geom_text(label = cc_freq_daily_selected$freq, position=position_stack(vjust=0.5))+
ggtitle("CC data top 3 location")
ggplotly(top3_vis_cc)
loyalty_freq_daily <- group_by(loyalty_data, timestamp, location) %>% summarise(freq = n())%>% arrange(desc(freq))
top3_place = c("Katerina's Cafe", "Hippokampos","Guy's Gyros")
loyalty_freq_daily_selected <- loyalty_freq_daily %>% filter(location %in% top3_place)
top3_vis_L <- ggplot(loyalty_freq_daily_selected, aes(fill=as.character(timestamp), y=freq, x=location)) +
geom_bar(position="stack", stat="identity") +
geom_text(label = loyalty_freq_daily_selected$freq, position=position_stack(vjust=0.5))+
ggtitle("Loyalty data top 3 location")
ggplotly(top3_vis_L)
##Question 2
gps_data <- read_csv("MC2/gps.csv")
gps_data$Timestamp_date <- date_time_parse(gps_data$Timestamp,
zone = "",
format = "%m/%d/%Y")
gps_data$Timestamp <- date_time_parse(gps_data$Timestamp,
zone = "",
format = "%m/%d/%Y %H:%M")
gps_data$Time <- format(gps_data$Timestamp, format = "%H:%M")
gps_data$Hour <- get_hour(gps_data$Timestamp)
gps_sort <- gps_data[order(gps_data$Timestamp, gps_data$id),]
stop_fin_2 = data.frame()
for (id_1 in unique(gps_sort$id))
{
make_stop = data.frame()
gps_id <- gps_sort %>%
filter(id==id_1)
for(i in 1:nrow(gps_id))
{
if(i < nrow(gps_id))
{
start <- gps_id$Timestamp[i]
end <- gps_id$Timestamp[i+1]
} else
{
start <- gps_id$Timestamp[i]
end <- gps_id$Timestamp[i]
print(start)
}
elapsed.time <- start%--%end
time_duration <- as.duration(elapsed.time)
if(time_duration > 300)
{
make_stop = rbind(make_stop,cbind(gps_id[i,],time_duration,i),cbind(gps_id[i+1,],time_duration,i))
}
}
stop_fin_2 = rbind(stop_fin_2, make_stop)
}
## [1] "2014-01-19 20:07:00 +08"
## [1] "2014-01-19 16:06:00 +08"
## [1] "2014-01-19 19:48:00 +08"
## [1] "2014-01-19 19:47:00 +08"
## [1] "2014-01-19 20:11:00 +08"
## [1] "2014-01-19 19:47:00 +08"
## [1] "2014-01-19 20:24:00 +08"
## [1] "2014-01-19 15:57:00 +08"
## [1] "2014-01-19 19:53:00 +08"
## [1] "2014-01-19 20:21:00 +08"
## [1] "2014-01-19 20:38:00 +08"
## [1] "2014-01-19 19:58:00 +08"
## [1] "2014-01-19 20:02:00 +08"
## [1] "2014-01-18 15:52:00 +08"
## [1] "2014-01-19 19:49:00 +08"
## [1] "2014-01-19 14:11:00 +08"
## [1] "2014-01-19 19:56:00 +08"
## [1] "2014-01-19 18:25:00 +08"
## [1] "2014-01-19 19:49:00 +08"
## [1] "2014-01-19 20:09:00 +08"
## [1] "2014-01-19 20:20:00 +08"
## [1] "2014-01-18 20:21:00 +08"
## [1] "2014-01-19 20:27:00 +08"
## [1] "2014-01-19 14:32:00 +08"
## [1] "2014-01-19 20:03:00 +08"
## [1] "2014-01-16 17:50:00 +08"
## [1] "2014-01-19 20:38:00 +08"
## [1] "2014-01-19 19:46:00 +08"
## [1] "2014-01-19 20:04:00 +08"
## [1] "2014-01-19 20:56:00 +08"
## [1] "2014-01-19 20:36:00 +08"
## [1] "2014-01-19 20:11:00 +08"
## [1] "2014-01-19 19:54:00 +08"
## [1] "2014-01-19 19:46:00 +08"
## [1] "2014-01-19 20:25:00 +08"
## [1] "2014-01-17 17:21:00 +08"
## [1] "2014-01-16 21:06:00 +08"
## [1] "2014-01-16 19:07:00 +08"
## [1] "2014-01-16 20:00:00 +08"
## [1] "2014-01-19 19:50:00 +08"
elapsed.time
## [1] 2014-01-19 19:50:00 +08--2014-01-19 19:50:00 +08
location_point = read_csv("MC2/stop_fin.csv") %>% select(lat,long,Possible_Location)%>% unique()
stop_fin_2$lat = round(stop_fin_2$lat,5)
stop_fin_2$long = round(stop_fin_2$long,5)
stop_final = merge(stop_fin_2,location_point, by.x = c("lat","long"), by.y = c("lat","long"), all.x = TRUE)
stop_fin <- stop_final
stop_fin_location <- stop_fin[!(stop_fin$Possible_Location %like% 'mice'),]
gps_sf_stopfin <- st_as_sf(stop_fin_location,
coords = c("long","lat"),
crs=4326)
gps_sf_stopfin_unique1 <- gps_sf_stopfin %>% select(Possible_Location,geometry)%>%unique()%>%arrange(Possible_Location)
gps_sf_stopfin_unique <- gps_sf_stopfin_unique1[-c(221,476,644,1100,1103),]
gps_path_stopfin <- gps_sf_stopfin_unique %>%
group_by(Possible_Location) %>%
summarize(m_Timestamp = mean(geometry),
do_union = FALSE ) %>%
st_cast("LINESTRING")
Abila_st <- st_read(dsn = "MC2/Geospatial",
layer = "Abila")
## Reading layer `Abila' from data source `C:\jovinkahartanto\assignment_distill\MC2\Geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 3290 features and 9 fields
## Geometry type: LINESTRING
## Dimension: XY
## Bounding box: xmin: 24.82401 ymin: 36.04502 xmax: 24.90997 ymax: 36.09492
## Geodetic CRS: WGS 84
P <- npts(Abila_st, by_feature = TRUE)
abila_st_2 <- cbind(Abila_st,P) %>% filter(P > 1)
bgmap <- raster("MC2/Geospatial/MC2-tourist.TIF")
tmap_mode("view")
tm_shape(bgmap) +
tm_rgb(bgmap, r = 1, g = 2, b = 3,
alpha = NA,
saturation = 1,
interpolate = TRUE,
max.value = 255) +
tm_shape(abila_st_2) +
tm_lines(col = "red", scale = 1)+
tm_shape(gps_path_stopfin[9,]) +
tm_lines(col = "blue", scale =4, interactive = TRUE) +
tm_text("Possible_Location", size = 3, remove.overlap = TRUE, overwrite.lines = TRUE, just = "top")
gps_data <- read_csv("MC2/gps.csv")
gps_data$Timestamp_date <- date_time_parse(gps_data$Timestamp,
zone = "",
format = "%m/%d/%Y")
gps_data$Timestamp <- date_time_parse(gps_data$Timestamp,
zone = "",
format = "%m/%d/%Y %H:%M")
gps_data$Time <- format(gps_data$Timestamp, format = "%H:%M")
gps_data$Hour <- get_hour(gps_data$Timestamp)
gps_sf <- st_as_sf(gps_data,
coords = c("long","lat"),
crs=4326)
gps_path <- gps_sf %>%
group_by(id, Timestamp_date,Hour) %>%
summarize(m_Timestamp = mean(Timestamp_date),
do_union = FALSE ) %>%
st_cast("LINESTRING")
Abila_st <- st_read(dsn = "MC2/Geospatial",
layer = "Abila")
## Reading layer `Abila' from data source `C:\jovinkahartanto\assignment_distill\MC2\Geospatial' using driver `ESRI Shapefile'
## Simple feature collection with 3290 features and 9 fields
## Geometry type: LINESTRING
## Dimension: XY
## Bounding box: xmin: 24.82401 ymin: 36.04502 xmax: 24.90997 ymax: 36.09492
## Geodetic CRS: WGS 84
P <- npts(Abila_st, by_feature = TRUE)
abila_st_2 <- cbind(Abila_st,P) %>% filter(P > 1)
gps_path_selected <- gps_path %>%
filter(id==28)
bgmap <- raster("MC2/Geospatial/MC2-tourist.TIF")
tmap_mode("view")
tm_shape(bgmap) +
tm_rgb(bgmap, r = 1, g = 2, b = 3,
alpha = NA,
saturation = 1,
interpolate = TRUE,
max.value = 255) +
tm_shape(abila_st_2) +
tm_lines(col = "red", scale = 1)+
tm_shape(gps_path_selected[1:4,]) +
tm_lines(col = "black", scale =4, interactive = TRUE) +
tm_text("Hour", size = 3, remove.overlap = TRUE, overwrite.lines = TRUE, just = "top")